{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualize Amazon Customer Reviews Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dataset columns:\n",
    "\n",
    "- `marketplace`: 2-letter country code (in this case all \"US\").\n",
    "- `customer_id`: Random identifier that can be used to aggregate reviews written by a single author.\n",
    "- `review_id`: A unique ID for the review.\n",
    "- `product_id`: The Amazon Standard Identification Number (ASIN).  `http://www.amazon.com/dp/<ASIN>` links to the product's detail page.\n",
    "- `product_parent`: The parent of that ASIN.  Multiple ASINs (color or format variations of the same product) can roll up into a single parent.\n",
    "- `product_title`: Title description of the product.\n",
    "- `product_category`: Broad product category that can be used to group reviews (in this case digital videos).\n",
    "- `star_rating`: The review's rating (1 to 5 stars).\n",
    "- `helpful_votes`: Number of helpful votes for the review.\n",
    "- `total_votes`: Number of total votes the review received.\n",
    "- `vine`: Was the review written as part of the [Vine](https://www.amazon.com/gp/vine/help) program?\n",
    "- `verified_purchase`: Was the review from a verified purchase?\n",
    "- `review_headline`: The title of the review itself.\n",
    "- `review_body`: The text of the review.\n",
    "- `review_date`: The date the review was written."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "pip install -q --upgrade pip\n",
    "pip install -q pandas==0.23.0\n",
    "pip install -q numpy==1.14.3\n",
    "pip install -q matplotlib==3.0.3\n",
    "pip install -q seaborn==0.8.1\n",
    "pip install -q PyAthena==1.8.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports & Settings\n",
    "\n",
    "import boto3\n",
    "import botocore\n",
    "import sagemaker\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "# Get region\n",
    "session = boto3.session.Session()\n",
    "region_name = session.region_name\n",
    "\n",
    "# Get SageMaker session & default S3 bucket\n",
    "sagemaker_session = sagemaker.Session()\n",
    "bucket = sagemaker_session.default_bucket()\n",
    "\n",
    "# Set Athena database & table\n",
    "database_name = \"dsoaws\"\n",
    "table_name = \"amazon_reviews_parquet\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## List all Product Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PyAthena imports\n",
    "from pyathena import connect\n",
    "from pyathena.pandas_cursor import PandasCursor\n",
    "from pyathena.util import as_pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
    "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute query using connection cursor\n",
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "\n",
    "cursor.execute(\n",
    "    \"SELECT DISTINCT product_category \\\n",
    "                FROM {0}.{1} \\\n",
    "                ORDER BY product_category\".format(\n",
    "        database_name, table_name\n",
    "    )\n",
    ")\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df_categories = as_pandas(cursor)\n",
    "df_categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store number of categories\n",
    "num_categories = df_categories.shape[0]\n",
    "print(num_categories)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ratings By Product Category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute query using connection cursor\n",
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "\n",
    "cursor.execute(\n",
    "    \"SELECT product_category, \\\n",
    "                    COUNT(star_rating) AS count_star_rating \\\n",
    "                FROM {0}.{1} \\\n",
    "                GROUP BY product_category \\\n",
    "                ORDER BY count_star_rating DESC\".format(\n",
    "        database_name, table_name\n",
    "    )\n",
    ")\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df_star_ratings = as_pandas(cursor)\n",
    "df_star_ratings.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store max ratings\n",
    "max_ratings = df_star_ratings[\"count_star_rating\"].max()\n",
    "print(max_ratings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set size and style to use\n",
    "if num_categories > 10:\n",
    "    plt.figure(figsize=(10, 10))\n",
    "else:\n",
    "    plt.figure(figsize=(10, 5))\n",
    "\n",
    "plt.style.use(\"seaborn-whitegrid\")\n",
    "\n",
    "# Create Seaborn barplot\n",
    "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df_star_ratings, saturation=1)\n",
    "\n",
    "# Set title\n",
    "plt.title(\"Number of Ratings per Product Category\")\n",
    "\n",
    "# Set x-axis ticks to match scale\n",
    "if max_ratings > 200000:\n",
    "    plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n",
    "    plt.xlim(0, 20000000)\n",
    "elif max_ratings <= 200000:\n",
    "    plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"1500K\", \"200K\"])\n",
    "    plt.xlim(0, 200000)\n",
    "\n",
    "plt.xlabel(\"Number of Ratings\")\n",
    "plt.ylabel(\"Product Category\")\n",
    "\n",
    "plt.tight_layout()\n",
    "\n",
    "# Export plot if needed\n",
    "# plt.savefig('ratings_per_category.png', dpi=300)\n",
    "\n",
    "# Show the barplot\n",
    "plt.show(barplot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Average Rating by Product Category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute query using connection cursor\n",
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "\n",
    "cursor.execute(\n",
    "    \"SELECT product_category, \\\n",
    "                    AVG(star_rating) AS avg_star_rating \\\n",
    "                FROM {0}.{1} \\\n",
    "                GROUP BY product_category \\\n",
    "                ORDER BY avg_star_rating DESC\".format(\n",
    "        database_name, table_name\n",
    "    )\n",
    ")\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df_average_ratings = as_pandas(cursor)\n",
    "df_average_ratings.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set some Seaborn parameters in advance\n",
    "sns.set_style = \"seaborn-whitegrid\"\n",
    "\n",
    "sns.set(\n",
    "    rc={\n",
    "        \"font.style\": \"normal\",\n",
    "        #            \"axes.facecolor\":\"white\",\n",
    "        \"figure.facecolor\": \"white\",\n",
    "        \"figure.titlesize\": 20,\n",
    "        \"text.color\": \"black\",\n",
    "        \"xtick.color\": \"black\",\n",
    "        \"ytick.color\": \"black\",\n",
    "        \"axes.labelcolor\": \"black\",\n",
    "        \"axes.grid\": True,\n",
    "        \"axes.labelsize\": 10,\n",
    "        #           'figure.figsize':(10.0, 10.0),\n",
    "        \"xtick.labelsize\": 10,\n",
    "        \"font.size\": 10,\n",
    "        \"ytick.labelsize\": 10,\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helper code to display values on bars\n",
    "\n",
    "\n",
    "def show_values_barplot(axs, space):\n",
    "    def _show_on_plot(ax):\n",
    "        for p in ax.patches:\n",
    "            _x = p.get_x() + p.get_width() + float(space)\n",
    "            _y = p.get_y() + p.get_height()\n",
    "            value = round(float(p.get_width()), 2)\n",
    "            ax.text(_x, _y, value, ha=\"left\")\n",
    "\n",
    "    if isinstance(axs, np.ndarray):\n",
    "        for idx, ax in np.ndenumerate(axs):\n",
    "            _show_on_plot(ax)\n",
    "    else:\n",
    "        _show_on_plot(axs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot average ratings per category\n",
    "\n",
    "# Create plot\n",
    "barplot = sns.barplot(y=\"product_category\", x=\"avg_star_rating\", data=df_average_ratings, saturation=1)\n",
    "\n",
    "# Set title and x-axis ticks\n",
    "plt.title(\"Average Rating by Product Category\")\n",
    "plt.xticks([1, 2, 3, 4, 5], [\"1-Star\", \"2-Star\", \"3-Star\", \"4-Star\", \"5-Star\"])\n",
    "\n",
    "# Helper code to show actual values afters bars\n",
    "show_values_barplot(barplot, 0.1)\n",
    "\n",
    "plt.xlabel(\"Average Rating\")\n",
    "plt.ylabel(\"Product Category\")\n",
    "\n",
    "# Export plot if needed\n",
    "# plt.tight_layout()\n",
    "# plt.savefig('avg_ratings_per_category.png', dpi=300)\n",
    "\n",
    "# Show graphic\n",
    "plt.show(barplot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rating Breakdown by Product Category"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### First, calculate standard deviation and square root of number of reviews"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute query using connection cursor\n",
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "\n",
    "cursor.execute(\n",
    "    \"SELECT product_category, \\\n",
    "                         AVG(star_rating) AS avg_star_rating, \\\n",
    "                         STDDEV(star_rating) AS stddev_star_rating, \\\n",
    "                         SQRT(COUNT(*)) AS sqrt_count \\\n",
    "                FROM {}.{} \\\n",
    "                GROUP BY product_category \\\n",
    "                ORDER BY avg_star_rating DESC\".format(\n",
    "        database_name, table_name\n",
    "    )\n",
    ")\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df_avg_stddev_sqrt = as_pandas(cursor)\n",
    "df_avg_stddev_sqrt.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Then, calculate standard deviation mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute query using connection cursor\n",
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "\n",
    "cursor.execute(\n",
    "    \"SELECT product_category, \\\n",
    "                         AVG(star_rating) AS avg_star_rating, \\\n",
    "                         (STDDEV(star_rating) / SQRT(COUNT(*))) AS sd_mean \\\n",
    "                FROM {}.{} \\\n",
    "                GROUP BY product_category \\\n",
    "                ORDER BY avg_star_rating DESC\".format(\n",
    "        database_name, table_name\n",
    "    )\n",
    ")\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df_breakdown_category_avg = as_pandas(cursor)\n",
    "df_breakdown_category_avg.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Number of reviews per star per category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute query using connection cursor\n",
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "\n",
    "cursor.execute(\n",
    "    \"SELECT product_category, \\\n",
    "                         star_rating, \\\n",
    "                         COUNT(*) AS count_reviews \\\n",
    "                FROM {}.{} \\\n",
    "                GROUP BY  product_category, star_rating \\\n",
    "                ORDER BY  product_category, star_rating ASC, count_reviews DESC\".format(\n",
    "        database_name, table_name\n",
    "    )\n",
    ")\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df_breakdown_category = as_pandas(cursor)\n",
    "df_breakdown_category.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create grouped DataFrames by category and by star rating\n",
    "grouped_category = df_breakdown_category.groupby(\"product_category\")\n",
    "grouped_star = df_breakdown_category.groupby(\"star_rating\")\n",
    "\n",
    "# Create sum of ratings per star rating\n",
    "df_sum = df_breakdown_category.groupby([\"star_rating\"]).sum()\n",
    "df_sum.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate total number of star ratings\n",
    "total = df_sum[\"count_reviews\"].sum()\n",
    "print(total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dictionary of product categories and array of star rating distribution per category\n",
    "\n",
    "distribution = {}\n",
    "count_reviews_per_star = []\n",
    "i = 0\n",
    "\n",
    "for category, ratings in grouped_category:\n",
    "    count_reviews_per_star = []\n",
    "    for star in ratings[\"star_rating\"]:\n",
    "        count_reviews_per_star.append(ratings.get_value(i, \"count_reviews\"))\n",
    "        i = i + 1\n",
    "    distribution[category] = count_reviews_per_star\n",
    "\n",
    "# Check if distribution has been created succesfully\n",
    "print(distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check if distribution keys are set correctly to product categories\n",
    "print(distribution.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check if star rating distributions are set correctly\n",
    "print(distribution.items())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build array per star across all categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort distribution by highest average rating per category\n",
    "sorted_distribution = {}\n",
    "\n",
    "df_average_ratings.iloc[:, 0]\n",
    "for index, value in df_average_ratings.iloc[:, 0].items():\n",
    "    sorted_distribution[value] = distribution[value]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build array per star across all categories\n",
    "star1 = []\n",
    "star2 = []\n",
    "star3 = []\n",
    "star4 = []\n",
    "star5 = []\n",
    "\n",
    "for k in sorted_distribution.keys():\n",
    "    stars = sorted_distribution.get(k)\n",
    "    star1.append(stars[0])\n",
    "    star2.append(stars[1])\n",
    "    star3.append(stars[2])\n",
    "    star4.append(stars[3])\n",
    "    star5.append(stars[4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the distributions of star ratings per product category\n",
    "\n",
    "categories = sorted_distribution.keys()\n",
    "\n",
    "total = np.array(star1) + np.array(star2) + np.array(star3) + np.array(star4) + np.array(star5)\n",
    "\n",
    "proportion_star1 = np.true_divide(star1, total) * 100\n",
    "proportion_star2 = np.true_divide(star2, total) * 100\n",
    "proportion_star3 = np.true_divide(star3, total) * 100\n",
    "proportion_star4 = np.true_divide(star4, total) * 100\n",
    "proportion_star5 = np.true_divide(star5, total) * 100\n",
    "\n",
    "# Add colors\n",
    "colors = [\"red\", \"purple\", \"blue\", \"orange\", \"green\"]\n",
    "\n",
    "# The position of the bars on the x-axis\n",
    "r = range(len(categories))\n",
    "barHeight = 1\n",
    "\n",
    "# Plot bars\n",
    "if num_categories > 10:\n",
    "    plt.figure(figsize=(10, 10))\n",
    "else:\n",
    "    plt.figure(figsize=(10, 5))\n",
    "\n",
    "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor=\"white\", height=barHeight, label=\"5-Star Ratings\")\n",
    "ax4 = plt.barh(\n",
    "    r,\n",
    "    proportion_star4,\n",
    "    left=proportion_star5,\n",
    "    color=colors[3],\n",
    "    edgecolor=\"white\",\n",
    "    height=barHeight,\n",
    "    label=\"4-Star Ratings\",\n",
    ")\n",
    "ax3 = plt.barh(\n",
    "    r,\n",
    "    proportion_star3,\n",
    "    left=proportion_star5 + proportion_star4,\n",
    "    color=colors[2],\n",
    "    edgecolor=\"white\",\n",
    "    height=barHeight,\n",
    "    label=\"3-Star Ratings\",\n",
    ")\n",
    "ax2 = plt.barh(\n",
    "    r,\n",
    "    proportion_star2,\n",
    "    left=proportion_star5 + proportion_star4 + proportion_star3,\n",
    "    color=colors[1],\n",
    "    edgecolor=\"white\",\n",
    "    height=barHeight,\n",
    "    label=\"2-Star Ratings\",\n",
    ")\n",
    "ax1 = plt.barh(\n",
    "    r,\n",
    "    proportion_star1,\n",
    "    left=proportion_star5 + proportion_star4 + proportion_star3 + proportion_star2,\n",
    "    color=colors[0],\n",
    "    edgecolor=\"white\",\n",
    "    height=barHeight,\n",
    "    label=\"1-Star Ratings\",\n",
    ")\n",
    "\n",
    "plt.title(\"Distribution of Reviews Per Rating Per Category\", fontsize=\"16\")\n",
    "plt.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\")\n",
    "plt.yticks(r, categories, fontweight=\"bold\")\n",
    "\n",
    "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize=\"14\")\n",
    "plt.gca().invert_yaxis()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Translate Ratings into Sentiment"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Attention**: the SQL query below can take a long time on large datasets, hence set a LIMIT if you run it from within the notebook, or execute the query in the Athena Console directly. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute query using connection cursor\n",
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "\n",
    "# If rating > 3, sentiment = 1 (positive), else 0 (negative)\n",
    "cursor.execute(\n",
    "    \"SELECT customer_id, \\\n",
    "                         product_id, \\\n",
    "                         star_rating, \\\n",
    "                         CASE \\\n",
    "                             WHEN star_rating > 3 THEN 1 \\\n",
    "                             ELSE 0 \\\n",
    "                         END \\\n",
    "                AS is_positive_sentiment \\\n",
    "                FROM {}.{} \\\n",
    "                ORDER BY review_id \\\n",
    "                LIMIT 10000\".format(\n",
    "        database_name, table_name\n",
    "    )\n",
    ")\n",
    "\n",
    "# Load query results into Pandas DataFrame and show results\n",
    "df_sentiment = as_pandas(cursor)\n",
    "df_sentiment.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Python [Wordcloud](http://amueller.github.io/word_cloud/) Visualization\n",
    "\n",
    "See the most popular words in the dataset using a Wordcloud visualization.\n",
    "Attention, the SQL query below can take a long time on large datasets, hence set a LIMIT."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m pip install wordcloud -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Execute query using connection cursor\n",
    "cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()\n",
    "\n",
    "cursor.execute(\n",
    "    \"SELECT review_body, \\\n",
    "                         CASE \\\n",
    "                             WHEN star_rating > 3 THEN 1 \\\n",
    "                             ELSE 0 \\\n",
    "                         END \\\n",
    "                AS is_positive_sentiment \\\n",
    "                FROM {}.{} \\\n",
    "                ORDER BY review_id \\\n",
    "                LIMIT 10000\".format(\n",
    "        database_name, table_name\n",
    "    )\n",
    ")\n",
    "\n",
    "df_reviews = as_pandas(cursor)\n",
    "df_reviews.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: To remove HTML markup like the word `br` in the `review_body`, use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import bs4\n",
    "\n",
    "df_reviews[\"review_body\"] = df_reviews[\"review_body\"].apply(lambda x: bs4.BeautifulSoup(x, \"lxml\").get_text())\n",
    "df_reviews"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from wordcloud import WordCloud, STOPWORDS\n",
    "\n",
    "\n",
    "def plot_wordcloud(\n",
    "    text,\n",
    "    mask=None,\n",
    "    max_words=200,\n",
    "    max_font_size=150,\n",
    "    figure_size=(20.0, 15.0),\n",
    "    title=None,\n",
    "    title_size=40,\n",
    "    image_color=False,\n",
    "):\n",
    "    stopwords = set(STOPWORDS)\n",
    "\n",
    "    wordcloud = WordCloud(\n",
    "        background_color=\"gray\",\n",
    "        stopwords=stopwords,\n",
    "        max_words=max_words,\n",
    "        max_font_size=max_font_size,\n",
    "        random_state=50,\n",
    "        width=800,\n",
    "        height=400,\n",
    "        mask=mask,\n",
    "    )\n",
    "    wordcloud.generate(str(text))\n",
    "\n",
    "    plt.figure(figsize=figure_size)\n",
    "    if image_color:\n",
    "        image_colors = ImageColorGenerator(mask)\n",
    "        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation=\"bilinear\")\n",
    "        plt.title(title, fontdict={\"size\": title_size, \"verticalalignment\": \"bottom\"})\n",
    "    else:\n",
    "        plt.imshow(wordcloud)\n",
    "        plt.title(title, fontdict={\"size\": title_size, \"color\": \"black\", \"verticalalignment\": \"bottom\"})\n",
    "    plt.axis(\"off\")\n",
    "    plt.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_wordcloud(\n",
    "    df_reviews.query(\"is_positive_sentiment == 0\")[\"review_body\"], title=\"Word Cloud of Negative Amazon Reviews\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_wordcloud(\n",
    "    df_reviews.query(\"is_positive_sentiment == 1\")[\"review_body\"], title=\"Word Cloud of Positive Amazon Reviews\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Let's do some further text analysis\n",
    "* Number of words\n",
    "* Number of unique words\n",
    "* Number of chars in the text\n",
    "* Number of stopwords\n",
    "* Number of punctuations\n",
    "* Average word length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "\n",
    "df_reviews[\"num_words\"] = df_reviews[\"review_body\"].apply(lambda x: len(str(x).split()))\n",
    "\n",
    "df_reviews[\"num_unique_words\"] = df_reviews[\"review_body\"].apply(lambda x: len(set(str(x).split())))\n",
    "\n",
    "df_reviews[\"num_chars\"] = df_reviews[\"review_body\"].apply(lambda x: len(str(x)))\n",
    "\n",
    "df_reviews[\"num_stopwords\"] = df_reviews[\"review_body\"].apply(\n",
    "    lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS])\n",
    ")\n",
    "\n",
    "df_reviews[\"num_punctuations\"] = df_reviews[\"review_body\"].apply(\n",
    "    lambda x: len([c for c in str(x) if c in string.punctuation])\n",
    ")\n",
    "\n",
    "df_reviews[\"mean_word_len\"] = df_reviews[\"review_body\"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_reviews.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Truncate extreme values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews = df_reviews.query(\"num_words <= 500 and num_punctuations < 500\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let's plot number of words, number of characters and number of punctuations in each class using Violin plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f, axes = plt.subplots(3, 1, figsize=(10, 20))\n",
    "\n",
    "sns.violinplot(x=\"is_positive_sentiment\", y=\"num_words\", data=df_reviews, ax=axes[0])\n",
    "axes[0].set_xlabel(\"Sentiment\", fontsize=12)\n",
    "axes[0].set_ylabel(\"Number Of Words\", fontsize=12)\n",
    "axes[0].set_title(\"Number Of Words In Each Class\", fontsize=15)\n",
    "\n",
    "sns.violinplot(x=\"is_positive_sentiment\", y=\"num_chars\", data=df_reviews, ax=axes[1])\n",
    "axes[1].set_xlabel(\"Sentiment\", fontsize=12)\n",
    "axes[1].set_ylabel(\"Number Of Characters\", fontsize=12)\n",
    "axes[1].set_title(\"Number Of Characters In Each Class\", fontsize=15)\n",
    "\n",
    "sns.violinplot(x=\"is_positive_sentiment\", y=\"num_punctuations\", data=df_reviews, ax=axes[2])\n",
    "axes[2].set_xlabel(\"Sentiment\", fontsize=12)\n",
    "axes[2].set_ylabel(\"Number Of Punctutations\", fontsize=12)\n",
    "axes[2].set_title(\"Number Of Punctuations In Each Class\", fontsize=15)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Start thinking about balancing positive vs. negative reviews"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count number of reviews per sentiment class\n",
    "print(df_reviews[\"is_positive_sentiment\"].value_counts())\n",
    "\n",
    "# Create Plot\n",
    "plot = sns.countplot(x=\"is_positive_sentiment\", data=df_reviews)\n",
    "plt.xlabel(\"Sentiment\", fontsize=16)\n",
    "plt.ylabel(\"Number Of Reviews\", fontsize=16)\n",
    "plt.title(\"Number Of Reviews Per Sentiment Class\", fontsize=16)\n",
    "\n",
    "plt.show(plot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Handling imbalanced datasets\n",
    "\n",
    "Here you can see we have a larger number of `positive` samples vs. `negative` ones. There are a number of techniques to blance this dataset out and the two most popular approaches are to either under-sample or over-sample. With under sampling you remove rows to balance the dataset out and in over sampling you can duplicate entries in the daatset which could lead to overfitting. This discussion is beyond the scope of this lab. You will under sample the data to balance the dataset but you can find more information [here]()."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import resample\n",
    "\n",
    "positive = df_reviews[df_reviews[\"is_positive_sentiment\"] == 1]\n",
    "negative = df_reviews[df_reviews[\"is_positive_sentiment\"] == 0]\n",
    "\n",
    "positive_downsampled = resample(\n",
    "    positive, replace=False, n_samples=len(negative), random_state=27  # sample without replacement  # match minority n\n",
    ")  # reproducible results\n",
    "\n",
    "# combine minority and downsampled majority\n",
    "downsampled = pd.concat([positive_downsampled, negative])\n",
    "\n",
    "# checking counts\n",
    "print(downsampled[\"is_positive_sentiment\"].value_counts())\n",
    "\n",
    "# Create Plot\n",
    "plot = sns.countplot(x=\"is_positive_sentiment\", data=downsampled)\n",
    "plt.xlabel(\"Sentiment\", fontsize=16)\n",
    "plt.ylabel(\"Number Of Reviews\", fontsize=16)\n",
    "plt.title(\"Number Of Reviews Per Sentiment Class\", fontsize=16)\n",
    "\n",
    "plt.show(plot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Test, Train, and Validation Datasets\n",
    "\n",
    "Depending on the framework you are leveraging in your AI/ML workloads you may decide to split the data into test, train, and validate splits before uploading to S3. You can leverage some built in functions in the sklearn package to do the split. To learn more about the sklearn framework click [here](https://scikit-learn.org/stable/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train, test = train_test_split(downsampled, test_size=0.2, random_state=0)\n",
    "test, validate = train_test_split(test, test_size=0.5, random_state=0)\n",
    "\n",
    "print(f\"Number of training examples: {len(train.index)}\")\n",
    "print(f\"Number of testing examples: {len(test.index)}\")\n",
    "print(f\"Number of validation examples: {len(validate.index)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize the Train, Test, and Validation Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pie chart, where the slices will be ordered and plotted counter-clockwise:\n",
    "\n",
    "labels = [\"Train\", \"Validation\", \"Test\"]\n",
    "sizes = [len(train.index), len(validate.index), len(test.index)]\n",
    "explode = (0.1, 0, 0)\n",
    "\n",
    "fig1, ax1 = plt.subplots()\n",
    "\n",
    "ax1.set_title(\"Split Of Train, Validatin And Test Data\", fontsize=16)\n",
    "ax1.pie(sizes, explode=explode, labels=labels, autopct=\"%1.1f%%\", startangle=90, textprops={\"fontsize\": 12})\n",
    "\n",
    "# Equal aspect ratio ensures that pie is drawn as a circle.\n",
    "ax1.axis(\"equal\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Optional - SQL Magic in Jupyter Notebooks\n",
    "\n",
    "You can use the built-in functionality in Jupyter to create shortcut magic commands to fit your needs. Here we will use the pyathena library like above to query the review data, but this time we will encapsulate the call in the Jupyter magic command."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyathena\n",
    "from pyathena.util import as_pandas\n",
    "\n",
    "from IPython.core import magic_arguments\n",
    "from IPython.core.magic import cell_magic, Magics, magics_class\n",
    "\n",
    "\n",
    "def query_athena(sql, region_name, s3_staging_dir):\n",
    "    cursor = pyathena.connect(region_name=region_name, s3_staging_dir=\"{}\".format(s3_staging_dir)).cursor()\n",
    "    cursor.execute(sql)\n",
    "    return cursor\n",
    "\n",
    "\n",
    "@magics_class\n",
    "class AthenaMagics(Magics):\n",
    "    s3_staging_dir = None\n",
    "    region_name = None\n",
    "\n",
    "    def parse_args(self, line):\n",
    "        args = magic_arguments.parse_argstring(self.athena, line)\n",
    "\n",
    "        # s3 staging directory\n",
    "        if args.s3_staging_dir is None and self.s3_staging_dir is None:\n",
    "            raise ValueError(\"s3_staging_dir for Athena should be set\")\n",
    "        if args.s3_staging_dir is not None:\n",
    "            self.s3_staging_dir = args.s3_staging_dir\n",
    "\n",
    "        # region name\n",
    "        if args.region_name is None and self.region_name is None:\n",
    "            raise ValueError(\"region_name for Athena should be set\")\n",
    "        if args.region_name is not None:\n",
    "            self.region_name = args.region_name\n",
    "\n",
    "    @cell_magic\n",
    "    @magic_arguments.magic_arguments()\n",
    "    @magic_arguments.argument(\n",
    "        \"--s3_staging_dir\",\n",
    "        \"-s\",\n",
    "        help=\"s3 path required by athena for writing query results (e.g. s3://your/staging/dir)\",\n",
    "    )\n",
    "    @magic_arguments.argument(\"--region_name\", \"-r\", help=\"aws region name (e.g. us-west-2)\")\n",
    "    def athena(self, line=\"\", cell=None):\n",
    "        self.parse_args(line)\n",
    "        cursor = query_athena(cell, self.region_name, self.s3_staging_dir)\n",
    "        return as_pandas(cursor)\n",
    "\n",
    "\n",
    "ip = get_ipython()\n",
    "ip.register_magics(AthenaMagics)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use the `%%athena` magic to query data registered in your Glue Data Catalog."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%athena --region_name $region_name --s3_staging_dir $s3_staging_dir\n",
    "\n",
    "select * from dsoaws.amazon_reviews_parquet limit 10;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
